years <- 2009:2013
dat_7 <- vector(mode = "list", length = length(years))
keep_cols_7 <- c("EIN", "TAXYR", "F9_07_COMP_DTK_TITLE", "first_name", "suffix", "status", "gender", "gender_confidence", "F9_07_COMP_DTK_COMP_ORG", "F9_07_COMP_DTK_EMPL_BEN", "F9_07_COMP_DTK_COMP_OTH")
for(i in 1:length(years)){
link <- paste0("https://nccs-efile.s3.us-east-1.amazonaws.com/partvii/PARTVII-", years[i], "-PEOPLEPARSED.csv")
temp <- fread(link, select = keep_cols_7)
colnames(temp) <- c("EIN", "year", "title", "first_name", "suffix", "status", "gender", "gender_confidence", "comp_main", "comp_benefit", "comp_other" )
dat_7[[i]] <- temp
}
dat_all_7 <- rbindlist(dat_7)
dat_all_7 <-
dat_all_7 %>%
filter(year <= max(years))
dat_1 <- vector(mode = "list", length = length(years))
keep_cols_1 <- c("ORG_EIN", "TAX_YEAR", "F9_01_ACT_GVRN_NUM_VOTE_MEMB", "F9_01_ACT_GVRN_NUM_VOTE_MEMB_IND")
for(i in 1:length(years)){
link <- paste0("https://nccs-efile.s3.us-east-1.amazonaws.com/parsed/F9-P01-T00-SUMMARY-", years[i], ".csv")
temp <- fread(link, select = keep_cols_1)
colnames(temp) <- c("EIN", "year", "num_vote_memb", "num_vote_memb_ind")
dat_1[[i]] <- temp
}
dat_all_1 <- rbindlist(dat_1)
dat_all_1 <-
dat_all_1 %>%
filter(year <= 2013)
dat_all_7 %>%
group_by(EIN, year) %>% ## Try grouping by URL
summarise(num_members = n()) %>%
ungroup %>%
group_by(year) %>%
summarise(avg_num_members = mean(num_members, na.rm=TRUE),
sd_num_members = sd(num_members, na.rm= TRUE))
## `summarise()` has grouped output by 'EIN'. You can override using the `.groups`
## argument.
## # A tibble: 5 × 3
## year avg_num_members sd_num_members
## <int> <dbl> <dbl>
## 1 2009 14.5 15.2
## 2 2010 11.2 12.5
## 3 2011 11.0 13.7
## 4 2012 10.8 13.1
## 5 2013 10.6 11.7
dat_all_7 %>%
group_by(EIN, year) %>%
summarise(num_members = n()) %>%
ungroup %>%
group_by(year) %>%
ggplot(aes(x = num_members, y = after_stat(count),
group = as.factor(year), color = as.factor(year) )) +
geom_density(adjust = 2) +
xlim(0, 50) +
ggtitle("Number of Voting Members Listed on Part VII") +
guides(color= guide_legend(title="Year"))+
xlab("Number of Members") +
ylab("Frequency")
## `summarise()` has grouped output by 'EIN'. You can override using the `.groups`
## argument.
## Warning: Removed 8916 rows containing non-finite values (`stat_density()`).
#~34% of "Number of Members" are NA
mean(is.na(dat_all_1$num_vote_memb))
## [1] 0.3406153
#~34% of "Number of Independent Members" are NA
mean(is.na(dat_all_1$num_vote_memb_ind))
## [1] 0.3406153
dat_all_1 %>%
group_by(year) %>%
summarise(avg_num_members = mean(num_vote_memb, na.rm=TRUE),
sd_num_members = sd(num_vote_memb, na.rm= TRUE),
avg_num_members_indep = mean(num_vote_memb_ind, na.rm=TRUE),
sd_num_members_indep = sd(num_vote_memb_ind, na.rm= TRUE))
## # A tibble: 5 × 5
## year avg_num_members sd_num_members avg_num_members_indep
## <int> <dbl> <dbl> <dbl>
## 1 2009 17.7 79.2 17.2
## 2 2010 21.4 318. 20.1
## 3 2011 20.5 294. 19.2
## 4 2012 20.7 403. 18.6
## 5 2013 19.4 250. 19.9
## # ℹ 1 more variable: sd_num_members_indep <dbl>
These numbers do not agree with the PART VII summary statistics.
dat_all_1 %>%
na.omit() %>%
pivot_longer(!c(EIN, year), names_to = "type", values_to = "num") %>%
ggplot(aes(x = num, y = ..count..,
color = as.factor(type))) +
geom_density(adjust = 2) +
facet_wrap(~year) +
xlim(0, 50) +
ggtitle("Number of Voting Members Listed on Part I") +
guides(color=guide_legend(title="Type")) +
xlab("Number of Members") +
ylab("Frequency")
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 29105 rows containing non-finite values (`stat_density()`).
dat_all_1 %>%
na.omit() %>%
filter(num_vote_memb < 100,
num_vote_memb_ind < 100) %>%
ggplot(aes(x = num_vote_memb, y = num_vote_memb_ind )) +
geom_point(alpha = 0.05) +
facet_wrap(~year) +
ggtitle("Voting Members vs. Independent Voting Members") +
xlab("Number of Voting Members") +
ylab("Number of Independent Voting Members")
#get number of members by org by year
by_ein_year_7 <-
dat_all_7 %>%
group_by(EIN, year) %>%
summarise(num_members = n())
## `summarise()` has grouped output by 'EIN'. You can override using the `.groups`
## argument.
#get sum of compensation for each person per year
num_zeros <-
dat_all_7 %>%
rowwise() %>%
mutate(comp_all = sum(comp_main, comp_benefit, comp_other, na.rm = TRUE))
#mark if they are zero or not
num_zeros$comp_all_0 <- num_zeros$comp_all == 0
dat_comp <-
num_zeros %>%
filter(!comp_all_0) %>%
group_by(EIN, year) %>%
summarise(num_members_comp = n(),
avg_comp = mean(comp_all)) %>%
left_join(by_ein_year_7[, c("EIN", "year", "num_members")] )
## `summarise()` has grouped output by 'EIN'. You can override using the `.groups`
## argument.
## Joining with `by = join_by(EIN, year)`
#number of compensated people per year
dat_comp %>%
filter(num_members < 101) %>%
ggplot(aes(x = num_members_comp, y = avg_comp, color = num_members)) +
geom_point(alpha = 0.25)+
facet_wrap(~year) +
xlim(0, 100) +
ylim(0, 1000000) +
ggtitle("Number of Compensated members Vs. Average Compensation") +
xlab("Number of Compensated Members") +
ylab("Average Total Compensation of Compensated Members")+
scale_color_viridis()
## Warning: Removed 257 rows containing missing values (`geom_point()`).
#number of board member vs number of compensated board members
dat_comp %>%
filter(avg_comp < 1000001) %>%
ggplot(aes(x = num_members, y = num_members_comp, color = avg_comp)) +
geom_point(alpha = 0.25)+
facet_wrap(~year) +
xlim(0, 100) +
ylim(0, 100) +
ggtitle("Number of Members Vs. Number of Compensated Members") +
xlab("Number of Members") +
ylab("Number of Compensated Members")+
scale_color_viridis()
## Warning: Removed 1046 rows containing missing values (`geom_point()`).
### Change in Number of Board Members over the years
## only keeping the orgs that have data in every year
num_per_year <-
by_ein_year_7 %>%
pivot_wider(names_from = year, values_from = num_members) %>%
na.omit() %>%
relocate(EIN, all_of(as.character(years)))
#get 1 year difference, set year 1 at 0 for baseline
diff_by_year <- data.frame(EIN = num_per_year$EIN)
diff_by_year[, 2] <- 0
colnames(diff_by_year)[2] <- as.character(years[1])
for(i in 2:length(years)){
diff_by_year[, i+1] <- num_per_year[, i+1] - num_per_year[, i]
}
diff_by_year %>%
pivot_longer(!EIN, names_to = "year", values_to = "diff") %>%
ggplot(aes(y = diff, x = year)) +
geom_boxplot(outlier.shape = NA) +
ylim(-5, 5) +
ggtitle("Differnce in Number of Board Members From Year To Year") +
ylab("difference from the previous year")
## Warning: Removed 7209 rows containing non-finite values (`stat_boxplot()`).
diff_by_year %>%
pivot_longer(!EIN, names_to = "year", values_to = "diff") %>%
filter(year != 2009) %>%
ggplot(aes(x = diff)) +
geom_histogram(bins = 20) +
xlim(-10, 10) +
facet_wrap(~year) +
ggtitle("Differnce in Number of Board Members From Year To Year") +
xlab("difference from the previous year")
## Warning: Removed 2787 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 4 rows containing missing values (`geom_bar()`).
#Percentage Change by year
perc_by_year <- data.frame(EIN = num_per_year$EIN)
perc_by_year[, 2] <- 0
colnames(perc_by_year)[2] <- as.character(years[1])
for(i in 2:length(years)){
perc_by_year[, i+1] <- (num_per_year[, i+1] - num_per_year[, i]) / num_per_year[, i] * 100
}
perc_by_year %>%
pivot_longer(!EIN, names_to = "year", values_to = "perc") %>%
ggplot(aes(y = perc, x = year)) +
geom_boxplot(outlier.shape = NA) +
ylim(-25, 25) +
ggtitle("Percentage Differnce in Number of Board Members From Year To Year") +
ylab("precentage difference from the previous year")
## Warning: Removed 12585 rows containing non-finite values (`stat_boxplot()`).
perc_by_year %>%
pivot_longer(!EIN, names_to = "year", values_to = "perc") %>%
filter(year != 2009) %>%
ggplot(aes(x = perc)) +
geom_histogram(bins = 20) +
xlim(-50, 50) +
facet_wrap(~year) +
ggtitle("Percentage Difference in Number of Board Members From Year To Year") +
xlab("percentage difference from the previous year")
## Warning: Removed 4174 rows containing non-finite values (`stat_bin()`).
dat_all_7 %>%
group_by(EIN, year, gender) %>%
summarise(num_members = n() ) %>%
ungroup %>%
group_by(year, gender) %>%
summarise(avg_num_members = mean(num_members, na.rm = TRUE)) %>%
arrange(desc(gender)) %>%
ggplot(aes(x = year, y = avg_num_members, fill = fct_inorder(gender))) +
geom_col(position="stack") +
ggtitle("Average Board Members By Gender") +
xlab("Year") +
ylab("Average Number of Members") +
guides(fill=guide_legend(title="Gender"))
## `summarise()` has grouped output by 'EIN', 'year'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
dat_all_7 %>%
group_by(EIN, year, gender) %>%
summarise(num_members = n() ) %>%
ungroup %>%
group_by(year, gender) %>%
filter(gender != "U") %>%
summarise(avg_num_members = mean(num_members, na.rm = TRUE)) %>%
arrange(desc(gender)) %>%
mutate(avg_num_members = round(avg_num_members, 2)) %>%
ggplot(aes(x = as.factor(year), y = avg_num_members, fill = fct_inorder(gender))) +
geom_col(position="fill") +
#geom_text(aes(label = avg_num_members), position = position_fill(vjust = 0.5)) +
ggtitle("Percentage of Board Members By Gender") +
xlab("Year") +
ylab("Percentage of Members") +
guides(fill=guide_legend(title="Gender"))
## `summarise()` has grouped output by 'EIN', 'year'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
dat_all_7 %>%
group_by(EIN, year, gender) %>%
summarise(num_members = n() ) %>%
ungroup %>%
group_by(year, gender) %>%
filter(gender != "U") %>%
summarise(avg_num_members = mean(num_members, na.rm = TRUE)) %>%
arrange(desc(gender)) %>%
mutate(avg_num_members = round(avg_num_members, 2)) %>%
ggplot(aes(x = as.factor(year), y = avg_num_members, fill = fct_inorder(gender))) +
geom_col(position="fill") +
#geom_text(aes(label = avg_num_members), position = position_fill(vjust = 0.5)) +
ggtitle("Percentage of Board Members By Gender") +
xlab("Year") +
ylab("Percentage of Members") +
guides(fill=guide_legend(title="Gender"))
## `summarise()` has grouped output by 'EIN', 'year'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
dat_all_7 %>%
select(EIN, year, gender) %>%
group_by(EIN, year, gender) %>%
summarise(num_members = n() ) %>%
ungroup %>%
pivot_wider(names_from = gender, values_from = num_members, values_fill = 0) %>%
rowwise() %>%
mutate(total = sum(M, U, `F`)) %>%
mutate(pM = M / total,
pU = U / total,
pF = `F` / total) %>%
group_by(year) %>%
summarise(M = mean(pM),
U = mean(pU),
`F` = mean(pF)) %>%
pivot_longer(!year, names_to = "gender", values_to = "avg_perc") %>%
mutate(avg_perc = round(avg_perc, 2)) %>%
ggplot(aes(x = as.factor(year), y = avg_perc, fill = fct_inorder(gender))) +
geom_col(position="fill") +
geom_text(aes(label = avg_perc), position = position_fill(vjust = 0.5)) +
ggtitle("Average Gener Percentage of Board Members") +
xlab("Year") +
ylab("Percentage of Members") +
guides(fill=guide_legend(title="Gender"))
## `summarise()` has grouped output by 'EIN', 'year'. You can override using the
## `.groups` argument.
dat_all_7 %>%
select(EIN, year, gender) %>%
group_by(EIN, year, gender) %>%
summarise(num_members = n() ) %>%
ungroup %>%
pivot_wider(names_from = gender, values_from = num_members, values_fill = 0) %>%
rowwise() %>%
mutate(total = sum(M, `F`)) %>%
mutate(pM = M / total,
pF = `F` / total) %>%
group_by(year) %>%
summarise(M = mean(pM, na.rm = T),
`F` = mean(pF, na.rm = T)) %>%
pivot_longer(!year, names_to = "gender", values_to = "avg_perc") %>%
mutate(avg_perc = round(avg_perc, 2)) %>%
ggplot(aes(x = as.factor(year), y = avg_perc, fill = fct_inorder(gender))) +
geom_col(position="fill") +
geom_text(aes(label = avg_perc), position = position_fill(vjust = 0.5)) +
ggtitle("Average Gener Percentage of Board Members") +
xlab("Year") +
ylab("Percentage of Members") +
guides(fill=guide_legend(title="Gender"))
## `summarise()` has grouped output by 'EIN', 'year'. You can override using the
## `.groups` argument.
##with in org gender change
dat_all_7 %>%
filter(year %in% c(2010, 2013)) %>%
filter(gender != "U") %>%
select(EIN, year, gender) %>%
group_by(EIN, year) %>%
summarise(mm = mean(gender == "M")) %>%
ungroup() %>%
pivot_wider(names_from = year, values_from = mm) %>%
na.omit() %>%
mutate(change = `2013` - `2010` ) %>%
pull(change) %>%
summary()
## `summarise()` has grouped output by 'EIN'. You can override using the `.groups`
## argument.
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.000000 -0.062500 0.000000 -0.005956 0.045455 1.000000
Conclusion: Suffix will not be helpful for “U”. All the suffix’s are JR, II, III, etc. They are not gendered.
#first look at all of the people with "U" to see if they have a suffix
dat_all_7 %>%
filter(gender == "U") %>%
filter(nchar(suffix) > 0) %>%
pull(suffix) %>%
table()
## .
## II III IV JR SENIOR SR
## 528 2537 282 4831 3 501
dat.more <-
dat_all_1 %>%
filter(num_vote_memb_ind > num_vote_memb)
dat.more %>%
ggplot(aes(x = num_vote_memb , y = num_vote_memb_ind)) +
geom_point() +
xlab("Number of Members") +
ylab("Number of Independent Members") +
ggtitle("Orgs with more independent members than members")
dat.more %>%
filter(num_vote_memb < 100) %>%
filter(num_vote_memb_ind < 500) %>%
ggplot(aes(x = num_vote_memb , y = num_vote_memb_ind)) +
geom_point() +
xlab("Number of Members") +
ylab("Number of Independent Members") +
ggtitle("Orgs with more independent members than members")
set.seed(100)
dat.more %>%
filter(num_vote_memb < 20) %>%
filter(num_vote_memb_ind < 100) %>%
sample_n(1)
## EIN year num_vote_memb num_vote_memb_ind
## 1: 201736709 2010 6 27
dat.more %>%
filter(num_vote_memb > 100) %>%
filter(num_vote_memb_ind > 200) %>%
sample_n(1)
## EIN year num_vote_memb num_vote_memb_ind
## 1: 596583560 2012 397 486
dat.more %>%
filter(num_vote_memb < 100) %>%
filter(num_vote_memb_ind > 200) %>%
sample_n(1)
## EIN year num_vote_memb num_vote_memb_ind
## 1: 710224574 2012 13 323